1. Carga y diagnóstico
inicial
ruta_datos <- if (file.exists("data/health_lifestyle_classification.csv")) {
"data/health_lifestyle_classification.csv"
} else {
file.path("..", "data", "health_lifestyle_classification.csv")
}
datos <- readr::read_csv(ruta_datos, show_col_types = FALSE) |>
janitor::clean_names()
# Tipificación básica de ordinales (si existen)
ord_levels <- list(
sleep_quality = c("Poor", "Fair", "Good", "Excellent"),
alcohol_consumption = c("None", "Occasionally", "Regularly"),
smoking_level = c("Non-smoker", "Light", "Moderate", "Heavy"),
healthcare_access = c("Poor", "Moderate", "Good", "Excellent"),
device_usage = c("Low", "Moderate", "High")
)
for (nm in names(ord_levels)) {
if (nm %in% names(datos)) datos[[nm]] <- factor(datos[[nm]], levels = ord_levels[[nm]], ordered = TRUE)
}
datos_resumen <- tibble(
registros = nrow(datos),
variables = ncol(datos),
categ = sum(map_chr(datos, ~ class(.x)[1]) %in% c("character", "factor", "ordered")),
cuant = sum(map_chr(datos, ~ class(.x)[1]) %in% c("numeric", "integer", "double"))
)
readr::write_csv(datos_resumen, file.path("..", "tab", "tipologia.csv"))
render_table(datos_resumen, caption = "Dimensión y tipología de la base")
Dimensión y tipología de la base
|
registros
|
variables
|
categ
|
cuant
|
|
1e+05
|
48
|
18
|
30
|
missing_tbl <- datos |>
summarise(across(everything(), ~ sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "faltantes") |>
mutate(pct = round(faltantes / nrow(datos), 4)) |>
arrange(desc(faltantes))
readr::write_csv(missing_tbl, file.path("..", "tab", "na_by_var.csv"))
render_table(head(missing_tbl, 15), caption = "Variables con mayor cantidad de NA")
Variables con mayor cantidad de NA
|
variable
|
faltantes
|
pct
|
|
insulin
|
15836
|
0.1584
|
|
heart_rate
|
14003
|
0.1400
|
|
alcohol_consumption
|
13910
|
0.1391
|
|
gene_marker_flag
|
10474
|
0.1047
|
|
income
|
8470
|
0.0847
|
|
daily_steps
|
8329
|
0.0833
|
|
blood_pressure
|
7669
|
0.0767
|
|
survey_code
|
0
|
0.0000
|
|
age
|
0
|
0.0000
|
|
gender
|
0
|
0.0000
|
|
height
|
0
|
0.0000
|
|
weight
|
0
|
0.0000
|
|
bmi
|
0
|
0.0000
|
|
bmi_estimated
|
0
|
0.0000
|
|
bmi_scaled
|
0
|
0.0000
|
Insight: la mayoría de las variables no tiene
valores faltantes; los mayores porcentajes se concentran en
insulin, daily_steps,
exercise_type y alcohol_consumption, por lo
que cualquier modelado futuro debe considerar imputación o análisis
segmentado.
2. Variables
categóricas
cat_vars <- datos |> select(where(~ is.character(.x) || inherits(.x, "factor") || inherits(.x, "ordered"))) |> names()
cat_vars_principales <- intersect(c("gender", "target", "education_level", "diet_type"), cat_vars)
2.1 Tablas de
frecuencia
frecuencias_cat <- datos |>
select(all_of(cat_vars_principales)) |>
pivot_longer(everything(), names_to = "variable", values_to = "categoria") |>
filter(!is.na(categoria)) |>
count(variable, categoria) |>
group_by(variable) |>
mutate(pct = round(n / sum(n), 3))
readr::write_csv(frecuencias_cat, file.path("..", "tab", "freq_all_cats.csv"))
render_table(frecuencias_cat, caption = "Frecuencias y proporciones por categoría")
Frecuencias y proporciones por categoría
|
variable
|
categoria
|
n
|
pct
|
|
diet_type
|
Keto
|
24764
|
0.248
|
|
diet_type
|
Omnivore
|
25089
|
0.251
|
|
diet_type
|
Vegan
|
25122
|
0.251
|
|
diet_type
|
Vegetarian
|
25025
|
0.250
|
|
education_level
|
Bachelor
|
25363
|
0.254
|
|
education_level
|
High School
|
25028
|
0.250
|
|
education_level
|
Master
|
24992
|
0.250
|
|
education_level
|
PhD
|
24617
|
0.246
|
|
gender
|
Female
|
49868
|
0.499
|
|
gender
|
Male
|
50132
|
0.501
|
|
target
|
diseased
|
29903
|
0.299
|
|
target
|
healthy
|
70097
|
0.701
|
Insight: target muestra distribución
70% healthy vs. 30% diseased; gender y
education_level están relativamente balanceadas con ligera
concentración en niveles altos de educación.
2.2 Gráficos
univariados
plot_gender <- datos |>
filter(!is.na(gender)) |>
count(gender) |>
ggplot(aes(x = gender, y = n, fill = gender)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = n), vjust = -0.2) +
labs(title = "Distribución de género", x = NULL, y = "Frecuencia") +
theme_minimal()
plot_gender

plot_target <- datos |>
count(target) |>
ggplot(aes(x = target, y = n, fill = target)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = n), vjust = -0.2) +
labs(title = "Distribución de target", x = NULL, y = "Frecuencia") +
theme_minimal()
plot_target

plot_education <- datos |>
filter(!is.na(education_level)) |>
count(education_level) |>
ggplot(aes(x = education_level, y = n)) +
geom_col(fill = "#2A9D8F") +
geom_text(aes(label = n), vjust = -0.2) +
labs(title = "Niveles educativos declarados", x = NULL, y = "Frecuencia") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
plot_education

plot_diet <- datos |>
filter(!is.na(diet_type)) |>
count(diet_type) |>
ggplot(aes(x = diet_type, y = n)) +
geom_col(fill = "#F4A261") +
geom_text(aes(label = n), vjust = -0.2) +
labs(title = "Tipo de dieta reportada", x = NULL, y = "Frecuencia") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
plot_diet

2.3 Cruces
categóricos
ct_gender_target <- datos |>
filter(!is.na(gender), !is.na(target)) |>
count(gender, target) |>
group_by(gender) |>
mutate(prop = n / sum(n))
render_table(ct_gender_target, caption = "Tabla gender x target", html_font_size = 9)
Tabla gender x target
|
gender
|
target
|
n
|
prop
|
|
Female
|
diseased
|
14933
|
0.2994505
|
|
Female
|
healthy
|
34935
|
0.7005495
|
|
Male
|
diseased
|
14970
|
0.2986117
|
|
Male
|
healthy
|
35162
|
0.7013883
|
plot_gender_target <- ct_gender_target |>
ggplot(aes(x = gender, y = prop, fill = target)) +
geom_col(position = "fill") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proporciones de target por género", x = NULL, y = "Proporción") +
theme_minimal()
plot_gender_target

if ("education_level" %in% names(datos)) {
ct_edu_target <- datos |>
filter(!is.na(education_level), !is.na(target)) |>
count(education_level, target) |>
group_by(education_level) |>
mutate(prop = n / sum(n))
render_table(ct_edu_target, caption = "Tabla education_level x target", html_font_size = 9)
}
Tabla education_level x target
|
education_level
|
target
|
n
|
prop
|
|
Bachelor
|
diseased
|
7516
|
0.2963372
|
|
Bachelor
|
healthy
|
17847
|
0.7036628
|
|
High School
|
diseased
|
7530
|
0.3008630
|
|
High School
|
healthy
|
17498
|
0.6991370
|
|
Master
|
diseased
|
7507
|
0.3003761
|
|
Master
|
healthy
|
17485
|
0.6996239
|
|
PhD
|
diseased
|
7350
|
0.2985742
|
|
PhD
|
healthy
|
17267
|
0.7014258
|
if (exists("ct_edu_target")) {
plot_edu_target <- ct_edu_target |>
ggplot(aes(x = education_level, y = prop, fill = target)) +
geom_col() +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proporciones de target por nivel educativo", x = "Nivel educativo", y = "Proporción") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
plot_edu_target
}

Insight: las proporciones de target
varían poco entre géneros, pero aumentan ligeramente las etiquetas
healthy en niveles educativos superiores.
3. Variables
cuantitativas
num_vars <- datos |> select(where(is.numeric)) |> names()
num_clave <- intersect(c("age", "bmi", "sleep_hours", "daily_steps", "stress_level", "mental_health_score"), num_vars)
3.1 Estadísticos
descriptivos
resumen_cuant <- datos |>
select(all_of(num_clave)) |>
pivot_longer(everything(), names_to = "variable", values_to = "valor") |>
group_by(variable) |>
summarise(
n = sum(!is.na(valor)),
media = mean(valor, na.rm = TRUE),
mediana = median(valor, na.rm = TRUE),
sd = sd(valor, na.rm = TRUE),
q1 = quantile(valor, 0.25, na.rm = TRUE),
q3 = quantile(valor, 0.75, na.rm = TRUE),
asimetria = psych::skew(valor, na.rm = TRUE),
curtosis = psych::kurtosi(valor, na.rm = TRUE)
)
readr::write_csv(resumen_cuant, file.path("..", "tab", "num_summary.csv"))
render_table(resumen_cuant, caption = "Estadísticos clave por variable cuantitativa")
Estadísticos clave por variable cuantitativa
|
variable
|
n
|
media
|
mediana
|
sd
|
q1
|
q3
|
asimetria
|
curtosis
|
|
age
|
100000
|
48.525990
|
48.000000
|
17.886768
|
33.000000
|
64.000000
|
0.0012578
|
-1.2012359
|
|
bmi
|
100000
|
24.493876
|
24.156734
|
5.951069
|
20.271405
|
28.258696
|
0.4206923
|
0.2137246
|
|
daily_steps
|
91671
|
7012.925748
|
7004.285450
|
2488.989356
|
5320.858377
|
8702.281301
|
0.0524075
|
-0.1437885
|
|
mental_health_score
|
100000
|
5.004680
|
5.000000
|
3.164228
|
2.000000
|
8.000000
|
0.0017028
|
-1.2207515
|
|
sleep_hours
|
100000
|
7.002008
|
6.998164
|
1.496821
|
5.986781
|
8.019219
|
0.0251318
|
-0.1227913
|
|
stress_level
|
100000
|
4.991600
|
5.000000
|
3.154997
|
2.000000
|
8.000000
|
-0.0004401
|
-1.2121821
|
Insight: daily_steps presenta mayor
asimetría positiva; bmi se mantiene próximo a simetría con
mediana ligeramente inferior a la media.
3.2 Histogramas y
boxplots (uno por figura)
datos |> ggplot(aes(x = age)) +
geom_histogram(bins = 30, fill = "#2A9D8F", color = "white") +
labs(title = "Histograma edad", x = "Edad", y = "Frecuencia") +
theme_minimal()

datos |> ggplot(aes(y = age)) +
geom_boxplot(fill = "#2A9D8F", alpha = 0.8) +
labs(title = "Boxplot edad", y = "Edad") +
theme_minimal()

datos |> ggplot(aes(x = bmi)) +
geom_histogram(bins = 30, fill = "#E76F51", color = "white") +
labs(title = "Histograma BMI", x = "BMI", y = "Frecuencia") +
theme_minimal()

datos |> ggplot(aes(y = bmi)) +
geom_boxplot(fill = "#E76F51", alpha = 0.8) +
labs(title = "Boxplot BMI", y = "BMI") +
theme_minimal()

datos |> ggplot(aes(x = daily_steps)) +
geom_histogram(bins = 30, fill = "#264653", color = "white") +
labs(title = "Histograma daily_steps", x = "Pasos diarios", y = "Frecuencia") +
theme_minimal()

datos |> ggplot(aes(y = daily_steps)) +
geom_boxplot(fill = "#264653", alpha = 0.8) +
labs(title = "Boxplot daily_steps", y = "Pasos diarios") +
theme_minimal()

3.3 Correlaciones y
dispersión
cor_vars <- intersect(c("age", "bmi", "height", "weight", "sleep_hours", "daily_steps"), num_vars)
cmat <- cor(datos[cor_vars], use = "pairwise.complete.obs")
corrplot::corrplot(cmat, method = "color", type = "upper", addCoef.col = "black", tl.col = "black")

datos |> ggplot(aes(x = height, y = weight)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Relación altura vs peso") +
theme_minimal()

datos |> ggplot(aes(x = weight, y = bmi)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Relación peso vs BMI") +
theme_minimal()

datos |> ggplot(aes(x = age, y = bmi)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE, color = "red") +
labs(title = "Relación edad vs BMI") +
theme_minimal()

Insight: peso y altura muestran correlación fuerte
con BMI, mientras que edad aporta variación moderada.
3.4 Normalidad y
sugerencia distribucional
normalidad <- map_df(c("age", "bmi", "sleep_hours", "daily_steps"), function(var) {
if (!var %in% names(datos)) return(NULL)
valores <- datos[[var]][!is.na(datos[[var]])]
if (length(valores) > 5000) valores <- sample(valores, 5000)
sw <- shapiro.test(valores)
tibble(variable = var, shapiro_w = sw$statistic, shapiro_p = sw$p.value)
})
render_table(normalidad, caption = "Prueba de normalidad Shapiro-Wilk")
Prueba de normalidad Shapiro-Wilk
|
variable
|
shapiro_w
|
shapiro_p
|
|
age
|
0.9550999
|
0.0000000
|
|
bmi
|
0.9894079
|
0.0000000
|
|
sleep_hours
|
0.9989825
|
0.0038962
|
|
daily_steps
|
0.9981960
|
0.0000157
|
dist_sugerida <- resumen_cuant |>
mutate(
sugerencia = case_when(
abs(asimetria) < 0.3 & abs(curtosis) < 1 ~ "Normal",
asimetria > 0.8 ~ "Gamma / Log-normal",
asimetria < -0.8 ~ "Distribución sesgada a la izquierda",
TRUE ~ "Ver análisis gráfico"
)
) |> select(variable, asimetria, curtosis, sugerencia)
render_table(dist_sugerida, caption = "Sugerencia distribucional por variable")
Sugerencia distribucional por variable
|
variable
|
asimetria
|
curtosis
|
sugerencia
|
|
age
|
0.0012578
|
-1.2012359
|
Ver análisis gráfico
|
|
bmi
|
0.4206923
|
0.2137246
|
Ver análisis gráfico
|
|
daily_steps
|
0.0524075
|
-0.1437885
|
Normal
|
|
mental_health_score
|
0.0017028
|
-1.2207515
|
Ver análisis gráfico
|
|
sleep_hours
|
0.0251318
|
-0.1227913
|
Normal
|
|
stress_level
|
-0.0004401
|
-1.2121821
|
Ver análisis gráfico
|
4. Análisis mixto
(cuantitativa vs categórica)
bmi_stats <- datos |>
filter(!is.na(bmi), !is.na(gender)) |>
group_by(gender) |>
summarise(
n = n(),
media = mean(bmi),
sd = sd(bmi),
mediana = median(bmi),
q1 = quantile(bmi, 0.25),
q3 = quantile(bmi, 0.75),
.groups = "drop"
)
readr::write_csv(bmi_stats, file.path("..", "tab", "num_by_group_bmi_gender.csv"))
render_table(bmi_stats, caption = "BMI por género")
BMI por género
|
gender
|
n
|
media
|
sd
|
mediana
|
q1
|
q3
|
|
Female
|
49868
|
24.52000
|
5.925752
|
24.20467
|
20.30653
|
28.26678
|
|
Male
|
50132
|
24.46789
|
5.976092
|
24.11126
|
20.22856
|
28.24639
|
datos |>
filter(!is.na(bmi), !is.na(gender)) |>
ggplot(aes(x = gender, y = bmi, fill = gender)) +
geom_boxplot(alpha = 0.8, show.legend = FALSE) +
labs(title = "Distribución de BMI por género", x = NULL, y = "BMI") +
theme_minimal()

age_target <- datos |>
filter(!is.na(age), !is.na(target)) |>
group_by(target) |>
summarise(
n = n(),
media = mean(age),
sd = sd(age),
mediana = median(age),
q1 = quantile(age, 0.25),
q3 = quantile(age, 0.75),
.groups = "drop"
)
readr::write_csv(age_target, file.path("..", "tab", "num_by_group_age_target.csv"))
render_table(age_target, caption = "Edad segmentada por target")
Edad segmentada por target
|
target
|
n
|
media
|
sd
|
mediana
|
q1
|
q3
|
|
diseased
|
29903
|
48.71635
|
17.88678
|
49
|
33
|
64
|
|
healthy
|
70097
|
48.44478
|
17.88627
|
48
|
33
|
64
|
datos |>
filter(!is.na(age), !is.na(target)) |>
ggplot(aes(x = target, y = age, fill = target)) +
geom_boxplot(alpha = 0.8, show.legend = FALSE) +
labs(title = "Edad por condición de salud", x = NULL, y = "Edad") +
theme_minimal()

Insight: las distribuciones de BMI y edad sugieren
diferencias pequeñas entre grupos, aunque estadísticamente detectables
dada la muestra grande.
5. Pruebas de
hipótesis
if ("bmi" %in% names(datos)) {
bmi_vec <- datos$bmi[!is.na(datos$bmi)]
prueba_media <- t.test(bmi_vec, mu = 25)
tamaño_efecto <- effectsize::cohens_d(bmi_vec, mu = 25)
resultado_media <- tibble(
hipotesis_nula = "mu = 25",
media_muestral = mean(bmi_vec),
estadistico_t = prueba_media$statistic,
gl = prueba_media$parameter,
p_valor = prueba_media$p.value,
ic95_li = prueba_media$conf.int[1],
ic95_ls = prueba_media$conf.int[2],
cohens_d = tamaño_efecto$Cohens_d
)
readr::write_csv(resultado_media, file.path("..", "tab", "test_mean_bmi.csv"))
render_table(resultado_media, caption = "Prueba t una muestra para BMI", html_font_size = 9)
}
Prueba t una muestra para BMI
|
hipotesis_nula
|
media_muestral
|
estadistico_t
|
gl
|
p_valor
|
ic95_li
|
ic95_ls
|
cohens_d
|
|
mu = 25
|
24.49388
|
-26.89442
|
99999
|
0
|
24.45699
|
24.53076
|
-0.0850476
|
if ("target" %in% names(datos)) {
exito <- sum(datos$target == "healthy", na.rm = TRUE)
total <- sum(!is.na(datos$target))
prueba_prop <- prop.test(exito, total, p = 0.5, correct = FALSE)
cohens_h <- 2 * asin(sqrt(exito / total)) - 2 * asin(sqrt(0.5))
resultado_prop <- tibble(
hipotesis_nula = "p = 0.5",
proporcion_muestral = exito / total,
estadistico_chi2 = prueba_prop$statistic,
p_valor = prueba_prop$p.value,
ic95_li = prueba_prop$conf.int[1],
ic95_ls = prueba_prop$conf.int[2],
cohens_h = cohens_h
)
readr::write_csv(resultado_prop, file.path("..", "tab", "test_prop_target.csv"))
render_table(resultado_prop, caption = "Prueba de proporción para healthy", html_font_size = 9)
}
Prueba de proporción para healthy
|
hipotesis_nula
|
proporcion_muestral
|
estadistico_chi2
|
p_valor
|
ic95_li
|
ic95_ls
|
cohens_h
|
|
p = 0.5
|
0.70097
|
16155.58
|
0
|
0.6981247
|
0.7037999
|
0.4136345
|
Insight: la media de BMI supera ligeramente 25 con
un tamaño de efecto pequeño; la proporción de individuos
healthy es mayor que 0.5 con diferencia práctica moderada
dado el tamaño muestral.
5.1 Candidatas de
prueba (media) y selección
mean_candidates <- tibble::tribble(
~var, ~mu0, ~just, ~units,
"sleep_hours", 7, "Referencia higiene del sueño", "horas",
"bmi", 25, "Umbral normal-superior BMI", "kg/m^2",
"blood_pressure",120, "Sistólica media saludable", "mmHg"
) |>
dplyr::filter(var %in% names(datos)) |>
rowwise() |>
mutate(
y = list(stats::na.omit(datos[[var]])),
n = length(y),
media = mean(y),
sd = stats::sd(y),
t_p = tryCatch(stats::t.test(y, mu = mu0)$p.value, error = function(e) NA_real_),
d = ifelse(sd > 0, (media - mu0)/sd, NA_real_),
power_approx = tryCatch(stats::power.t.test(n = n, delta = abs(media - mu0), sd = sd, sig.level = 0.05, type = "one.sample")$power, error = function(e) NA_real_)
) |>
ungroup() |>
select(var, mu0, just, units, n, media, sd, d, t_p, power_approx) |>
arrange(dplyr::desc(abs(d)))
readr::write_csv(mean_candidates, file.path("..", "tab", "test_mean_candidates.csv"))
render_table(mean_candidates, caption = "Candidatas prueba de media (ordenadas por |d|)", html_font_size = 9)
Candidatas prueba de media (ordenadas por |d|)
|
var
|
mu0
|
just
|
units
|
n
|
media
|
sd
|
d
|
t_p
|
power_approx
|
|
bmi
|
25
|
Umbral normal-superior BMI
|
kg/m^2
|
100000
|
24.493876
|
5.951069
|
-0.0850476
|
0.0000000
|
1.0000000
|
|
sleep_hours
|
7
|
Referencia higiene del sueño
|
horas
|
100000
|
7.002008
|
1.496821
|
0.0013417
|
0.6713670
|
0.0623067
|
|
blood_pressure
|
120
|
Sistólica media saludable
|
mmHg
|
92331
|
119.980149
|
15.015503
|
-0.0013220
|
0.6878954
|
0.0595865
|
selected_mean <- mean_candidates |> slice_head(n = 1)
selected_mean
5.2 Candidatas de
prueba (proporción) y selección
prop_candidates <- tibble::tribble(
~var, ~level, ~p0, ~just,
"target", "healthy", 0.50, "Balance operativo",
"smoking_level", "Heavy", 0.08, "Referencia de tabaquismo pesado",
"alcohol_consumption","Regularly", 0.20, "Consumo regular esperado"
) |>
dplyr::filter(var %in% names(datos)) |>
rowwise() |>
mutate(
x = sum(stats::na.omit(datos[[var]]) == level),
n = sum(!is.na(datos[[var]])),
p_hat = x/n,
test = list(tryCatch(stats::prop.test(x, n, p = p0, correct = FALSE), error = function(e) NULL)),
p_valor = ifelse(is.null(test), NA_real_, test$p.value),
ic_li = ifelse(is.null(test), NA_real_, test$conf.int[1]),
ic_ls = ifelse(is.null(test), NA_real_, test$conf.int[2]),
h = 2*asin(sqrt(p_hat)) - 2*asin(sqrt(p0))
) |>
ungroup() |>
select(var, level, p0, just, n, x, p_hat, h, p_valor, ic_li, ic_ls) |>
arrange(dplyr::desc(abs(h)))
readr::write_csv(prop_candidates, file.path("..", "tab", "test_prop_candidates.csv"))
render_table(prop_candidates, caption = "Candidatas prueba de proporción (ordenadas por |h|)", html_font_size = 9)
Candidatas prueba de proporción (ordenadas por |h|)
|
var
|
level
|
p0
|
just
|
n
|
x
|
p_hat
|
h
|
p_valor
|
ic_li
|
ic_ls
|
|
smoking_level
|
Heavy
|
0.08
|
Referencia de tabaquismo pesado
|
100000
|
33208
|
0.3320800
|
0.6547863
|
0
|
0.3291675
|
0.3350054
|
|
target
|
healthy
|
0.50
|
Balance operativo
|
100000
|
70097
|
0.7009700
|
0.4136345
|
0
|
0.6981247
|
0.7037999
|
|
alcohol_consumption
|
Regularly
|
0.20
|
Consumo regular esperado
|
86090
|
28782
|
0.3343245
|
0.3057661
|
0
|
0.3311807
|
0.3374832
|
selected_prop <- prop_candidates |> slice_head(n = 1)
selected_prop
6. Densidades por
categoría (figuras separadas)
if (all(c("bmi","gender") %in% names(datos))) {
ggplot(datos, aes(x = bmi, fill = gender)) +
geom_density(alpha = 0.4) +
labs(title = "Densidad de BMI por género", x = "BMI", y = "Densidad") +
theme_minimal()
}

if (all(c("sleep_hours","target") %in% names(datos))) {
ggplot(datos, aes(x = sleep_hours, fill = target)) +
geom_density(alpha = 0.4) +
labs(title = "Densidad de horas de sueño por target", x = "Horas de sueño", y = "Densidad") +
theme_minimal()
}

6. Diccionario de
variables (resumen)
if (file.exists("data/diccionario_variables.md")) {
cat(readr::read_file("data/diccionario_variables.md"))
} else {
message("Archivo de diccionario no encontrado.")
}